In [85]:
df = pd.read_csv('train.csv')
In [86]:
# Name encoding: 1 when the animal has a name, 0 when the name is missing.
# Done as one vectorised assignment: the chained form `df.Name[mask] = 1` can be
# applied to a temporary copy (pandas SettingWithCopyWarning) and leave df unchanged.
# Comparing against 0 after fillna keeps the cell idempotent when it is re-run.
df['Name'] = (df['Name'].fillna(0) != 0).astype(int)
In [87]:
# AnimalType encoding: 1 for 'Dog', 0 for every other value (cats).
is_dog = df['AnimalType'] == 'Dog'
df['AnimalType'] = is_dog.astype(int)
In [88]:
def check_over13years(x):
    """Collapse ages of 13 years or more into the single label 'over 13 years'.

    Strings of 7 characters or fewer, strings that do not end in 'years', and
    ages below 13 years are returned unchanged.
    """
    # Only strings longer than 7 characters can hold a two-digit year count.
    long_enough = len(x) > 7
    if long_enough and x.endswith('years') and int(x[:-5]) >= 13:
        return 'over 13 years'
    return x
In [89]:
# ageuponoutcome. 13년 이상된 강아지는 13년 이상으로 통일시키자
# 데이터가 없는 건 0 years랑 굉장히 유사한 데이터. 통일시키기!
df.AgeuponOutcome = df.AgeuponOutcome.fillna('0 years')
df.AgeuponOutcome = df.AgeuponOutcome.apply(check_over13years)
In [90]:
#필요없는 column제거
df = df.drop(['AnimalID', 'OutcomeSubtype'], axis=1)
In [91]:
# Breed classification: only the breeds listed below are kept.
# Order matters: check_in_breeds scans this list from the top and returns the
# first entry found inside a Breed string, so earlier entries win for mixes.
breeds = ['Labrador Retriever',
'German Shepherd',
'Golden Retriever',
'Beagle',
'Bulldog',
'Yorkshire Terrier',
'Boxer',
'Poodle',
'Rottweiler',
'Siberian Husky',
'Maltese',
'Persian',
'Maine Coon',
'Siamese',
'American Shorthair',
'Swedish Vallhund',
'Finnish',
'Catahoula',
'Ridgeback',
'Carolina',
'Manx',
'Domestic Shorthair',
'Pit Bull',
'Chihuahua',
'Domestic Medium Hair',
'Domestic Longhair',
'Dachshund',
'Rat Terrier',
'Miniature Schnauzer',
'Cairn Terrier',
'Shih Tzu']
In [92]:
def check_in_breeds(x):
    """Return the first entry of the module-level `breeds` list contained in `x`.

    `x` is a breed description string. The result is None when no listed breed
    occurs in it.
    """
    matching = (breed for breed in breeds if breed in x)
    return next(matching, None)
In [93]:
# Keep the breed when it is one of the wanted breeds; otherwise it becomes None.
df['Breed'] = df['Breed'].map(check_in_breeds)
In [94]:
## 색 개수 30개 이하인건 others로 빼기
list_color = list(df.Color)
list_color_over50 = []
for color in set(list_color):
if list_color.count(color) >= 50:
list_color_over50.append(color)
In [95]:
print('색 종류 ->',len(set(list_color)))
print('샘플이 50개 이상인 색 종류 -> ', len(list_color_over50))
In [96]:
def check_in_colors(x):
    """Return `x` when it is in the module-level `list_color_over50`, else None."""
    return x if x in list_color_over50 else None
In [97]:
# 샘플이 50개 이상인 색이면 그대로 두고 아니면 None으로 채우기
df.Color = df.Color.apply(check_in_colors)
In [98]:
#시간 분류
df['hour'] = df.DateTime.apply(lambda x:x[11:13])
In [99]:
# Hours 5~8 are grouped, 20~22 are grouped, 23~0 are grouped, the rest stay as-is.
def check_hour(x):
    """Map a two-character hour string to its bucket label.

    Hours listed in the table are merged into '5_8', '20_22' or '23_0';
    any other value is returned unchanged.
    """
    # NOTE(review): the comment above says 5~8, but '03' is bucketed as '5_8'
    # and '08' is not. Confirm whether that is intentional.
    hour_buckets = {
        '03': '5_8', '05': '5_8', '06': '5_8', '07': '5_8',
        '20': '20_22', '21': '20_22', '22': '20_22',
        '23': '23_0', '00': '23_0',
    }
    return hour_buckets.get(x, x)
In [100]:
df.hour = df.hour.apply(check_hour)
In [101]:
df.hour.unique()
Out[101]:
In [102]:
X = df.drop('OutcomeType', axis=1)
In [103]:
y = df.OutcomeType
In [104]:
X_dummy = X_dummy = pd.get_dummies(X.ix[:, ['SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color', 'hour']])
In [105]:
X_dummy['Name'] = X.Name
X_dummy['AnimalType'] = X.AnimalType
In [106]:
from sklearn.cross_validation import train_test_split
In [107]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_dummy, y, test_size=0.20, random_state=42)
In [108]:
from sklearn.ensemble import RandomForestClassifier
In [109]:
# using RandomForest
model_rf = RandomForestClassifier(n_estimators=30)
result_rf = model_rf.fit(X_train, y_train)
result_rf.score(X_test, y_test)
Out[109]:
In [110]:
# Pair every feature column with its importance in the fitted forest.
# The pairs are materialised with list(): on Python 3 zip is a lazy one-shot
# iterator, so the constructor is handed a concrete list of rows instead.
importance_rows = list(zip(X_dummy.columns, model_rf.feature_importances_))
df_importance = pd.DataFrame(importance_rows, columns=['colname', 'importance'])
df_importance.sort_values('importance', ascending=False)
Out[110]:
In [111]:
print(metrics.classification_report(y_test, model_rf.predict(X_test)))
In [112]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, model_rf.predict(X_test))
print(cm)
for i in range(len(cm)):
cm[i, :] = (((cm[i, :]) /(sum(cm[i, :]))) *100)
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix _ RandomForest')
plt.colorbar()
outcomes = sorted(y_test.unique())
tick_marks = np.arange(len(set(list(y_test))))
plt.xticks(tick_marks, outcomes, rotation=45)
plt.yticks(tick_marks, outcomes)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[112]:
In [113]:
model_rf.predict_proba(X_test)
Out[113]:
In [114]:
from sklearn.linear_model import LogisticRegression
In [115]:
from sklearn import metrics
In [116]:
model_lr = LogisticRegression(C=1e5).fit(X_train, y_train)
print(metrics.classification_report(y_test, model_lr.predict(X_test)))
In [117]:
model_lr.score(X_test, y_test)
Out[117]:
In [118]:
cm = confusion_matrix(y_test, model_lr.predict(X_test))
print(cm)
for i in range(len(cm)):
cm[i, :] = (((cm[i, :]) /(sum(cm[i, :]))) *100)
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix _ Logistic Regression')
plt.colorbar()
outcomes = sorted(y_test.unique())
tick_marks = np.arange(len(set(list(y_test))))
plt.xticks(tick_marks, outcomes, rotation=45)
plt.yticks(tick_marks, outcomes)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[118]:
In [119]:
model_lr.predict_proba(X_test)
Out[119]:
In [121]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
In [122]:
# Lasso is a regressor and cannot be fitted on the string class labels in
# y_train. The classification counterpart is an L1-penalised logistic
# regression, so that is fitted instead (liblinear supports the L1 penalty).
# NOTE(review): the original used alpha=0 (no penalty); tune C if the L1 effect matters.
lasso1 = LogisticRegression(penalty='l1', solver='liblinear').fit(X_train, y_train)
In [123]:
Lasso?
In [ ]:
In [65]:
from sklearn.svm import SVC
In [66]:
model_svc = SVC(probability=True).fit(X_train, y_train)
In [68]:
print(metrics.classification_report(y_test, model_svc.predict(X_test)))
In [69]:
model_svc.score(X_test, y_test)
Out[69]:
In [71]:
cm = confusion_matrix(y_test, model_svc.predict(X_test))
print(cm)
for i in range(len(cm)):
cm[i, :] = (((cm[i, :]) /(sum(cm[i, :]))) *100)
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix _ SVM')
plt.colorbar()
outcomes = sorted(y_test.unique())
tick_marks = np.arange(len(set(list(y_test))))
plt.xticks(tick_marks, outcomes, rotation=45)
plt.yticks(tick_marks, outcomes)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[71]:
In [85]:
from sklearn.ensemble import VotingClassifier
In [84]:
clf1 = LogisticRegression(C=1e5, random_state=213)
clf2 = RandomForestClassifier(n_estimators=30, random_state=123)
clf3 = SVC(probability=True)
In [86]:
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svc', clf3)], voting='soft')
In [93]:
# 오래걸리니까 잠시후에 해보자
eclf.fit(X_train, y_train)
Out[93]:
In [94]:
eclf.score(X_test, y_test)
Out[94]:
In [95]:
predict_eclf = eclf.predict(X_test)
In [96]:
print(metrics.classification_report(y_test, predict_eclf))
In [99]:
cm = confusion_matrix(y_test, eclf.predict(X_test))
print(cm)
for i in range(len(cm)):
cm[i, :] = (((cm[i, :]) /(sum(cm[i, :]))) *100)
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
outcomes = sorted(y_test.unique())
tick_marks = np.arange(len(set(list(y_test))))
plt.xticks(tick_marks, outcomes, rotation=45)
plt.yticks(tick_marks, outcomes)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[99]:
In [105]:
eclf_kaggle = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svc', clf3)], voting='soft')
In [109]:
X_dummy = X_dummy.drop('OutcomeType', axis=1)
In [110]:
eclf_kaggle.fit(X_dummy, y)
Out[110]:
In [112]:
# Load the Kaggle test set and apply the same preprocessing as the training data.
df_test = pd.read_csv('test.csv')
df_test = df_test.drop(['ID'], axis=1)
# Name encoding: 1 when the animal has a name, 0 when the name is missing.
# One vectorised assignment replaces the chained `df_test.Name[mask] = 1`, which
# can be applied to a temporary copy (pandas SettingWithCopyWarning).
df_test['Name'] = (df_test['Name'].fillna(0) != 0).astype(int)
# AnimalType encoding: 1 for dogs, 0 for cats.
df_test.AnimalType = df_test.AnimalType.apply(lambda x: 1 if x=='Dog' else 0)
# AgeuponOutcome: missing ages become '0 years'; 13 years or more is one bucket.
df_test.AgeuponOutcome = df_test.AgeuponOutcome.fillna('0 years')
df_test.AgeuponOutcome = df_test.AgeuponOutcome.apply(check_over13years)
# Breed: keep only the breeds in `breeds`, everything else becomes None.
df_test.Breed = df_test.Breed.apply(check_in_breeds)
# Colour: keep only the colours in `list_color_over50` (computed from the training data).
df_test.Color = df_test.Color.apply(check_in_colors)
# Hour: extract characters 11-12 of DateTime and bucket them.
df_test['hour'] = df_test.DateTime.apply(lambda x:x[11:13])
df_test.hour = df_test.hour.apply(check_hour)
df_test = df_test.drop('DateTime', axis=1)
In [113]:
X_test_dummy = pd.get_dummies(df_test.ix[:, ['SexuponOutcome', 'AgeuponOutcome',
'Breed', 'Color', 'hour']])
In [114]:
X_test_dummy['Name'] = df_test.Name
X_test_dummy['AnimalType'] = df_test.AnimalType
In [115]:
print(X_test_dummy.columns[130:150])
print(X_dummy.columns[130:150])
In [154]:
result_predict = eclf_kaggle.predict(X_test_dummy)
In [155]:
df_result = pd.DataFrame(columns=['ID','Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
df_result['ID'] = range(1, len(result_predict)+1)
count = 1
for predict_val in result_predict:
df_result.loc[df_result.ID==count, predict_val] = 1
count+=1
df_result = df_result.fillna(0)
df_result.index = df_result.ID
df_result = df_result.drop('ID', axis=1)
df_result.to_csv('submission2.csv')
In [116]:
# Combine a random forest and a logistic regression, both trained on the full
# training set, by multiplying their predicted class probabilities.
model_rf = RandomForestClassifier(random_state=42)  # seeded for repeatable output
model_rf.fit(X_dummy, y)
model_lf = LogisticRegression()
model_lf.fit(X_dummy, y)
result_lf = model_lf.predict_proba(X_test_dummy)
result_rf = model_rf.predict_proba(X_test_dummy)
# The element-wise product is only meaningful if both models order their
# probability columns identically.
assert list(model_rf.classes_) == list(model_lf.classes_)
rs = (result_rf*result_lf)
# Column index -> outcome label, taken from the fitted model rather than
# hard-coded, so it always matches the predict_proba column order.
result_dict = {index: str(label) for index, label in enumerate(model_rf.classes_)}
# Most probable outcome label for every test row.
rs2 = [result_dict[i] for i in rs.argmax(axis=1)]
In [118]:
# Build the submission as a one-hot table (one row per test animal, 1 in the
# predicted outcome column). get_dummies does this in one vectorised step; the
# previous loop searched the whole frame once per row.
outcome_columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
df_result = pd.get_dummies(pd.Series(rs2))
# Outcomes that were never predicted still need a column (filled with 0).
df_result = df_result.reindex(columns=outcome_columns, fill_value=0).astype(int)
# IDs are 1-based row numbers and are written as the index column named 'ID'.
df_result.index = pd.Index(range(1, len(rs2) + 1), name='ID')
# NOTE(review): this is the same file name the ensemble submission cell writes
# to, so one overwrites the other. Confirm that is intended.
df_result.to_csv('submission2.csv')
In [72]:
# df2는 outcometype이 died인 샘플을 제외한 dataframe.
df2 = X_dummy
df2['OutcomeType'] = y
In [74]:
df2 = df2[df2.OutcomeType != 'Died']
In [76]:
X_dummy2 = df2.drop('OutcomeType', axis=1)
In [78]:
y2 = df2.OutcomeType
In [80]:
# train test split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_dummy2, y2, test_size=0.20, random_state=42)
In [81]:
# using RandomForest
model_rf2 = RandomForestClassifier(n_estimators=30)
model_rf2.fit(X_train2, y_train2)
model_rf2.score(X_test2, y_test2)
Out[81]:
In [83]:
print(metrics.classification_report(y_test2, model_rf2.predict(X_test2)))
In [90]:
cm = confusion_matrix(y_test2, model_rf2.predict(X_test2))
print(cm)
for i in range(len(cm)):
cm[i, :] = (((cm[i, :]) /(sum(cm[i, :]))) *100)
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix _ randomforest')
plt.colorbar()
outcomes = sorted(y_test2.unique())
tick_marks = np.arange(len(set(list(y_test2))))
plt.xticks(tick_marks, outcomes, rotation=45)
plt.yticks(tick_marks, outcomes)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[90]:
In [87]:
model_lr2 = LogisticRegression(C=1e5).fit(X_train2, y_train2)
print(metrics.classification_report(y_test2, model_lr2.predict(X_test2)))
In [89]:
model_lr2.score(X_test, y_test)
Out[89]:
In [91]:
cm = confusion_matrix(y_test2, model_lr2.predict(X_test2))
print(cm)
for i in range(len(cm)):
cm[i, :] = (((cm[i, :]) /(sum(cm[i, :]))) *100)
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix _ logistic regression')
plt.colorbar()
outcomes = sorted(y_test2.unique())
tick_marks = np.arange(len(set(list(y_test2))))
plt.xticks(tick_marks, outcomes, rotation=45)
plt.yticks(tick_marks, outcomes)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[91]:
In [98]:
eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svc', clf3)], voting='soft')
In [100]:
eclf2.fit(X_train2, y_train2)
Out[100]:
In [101]:
eclf2.score(X_test2, y_test2)
Out[101]:
In [102]:
predict_eclf2 = eclf2.predict(X_test2)
In [103]:
print(metrics.classification_report(y_test2, predict_eclf2))
In [104]:
cm = confusion_matrix(y_test2, predict_eclf2)
print(cm)
for i in range(len(cm)):
cm[i, :] = (((cm[i, :]) /(sum(cm[i, :]))) *100)
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix _ esemble')
plt.colorbar()
outcomes = sorted(y_test2.unique())
tick_marks = np.arange(len(set(list(y_test2))))
plt.xticks(tick_marks, outcomes, rotation=45)
plt.yticks(tick_marks, outcomes)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[104]:
In [156]:
eclf_kaggle2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svc', clf3)], voting='soft')
In [157]:
eclf_kaggle2.fit(X_dummy2, y2)
Out[157]:
In [158]:
result_predict2 = eclf_kaggle2.predict(X_test_dummy)
In [159]:
df_result = pd.DataFrame(columns=['ID','Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
df_result['ID'] = range(1, len(result_predict2)+1)
count = 1
for predict_val in result_predict2:
df_result.loc[df_result.ID==count, predict_val] = 1
count+=1
df_result = df_result.fillna(0)
df_result.index = df_result.ID
df_result = df_result.drop('ID', axis=1)
df_result.to_csv('submission3.csv')
In [1]:
In [ ]: